Lab 4:Mapping causal & predictive approaches

Practice session

July 27, 2024

GOAL OF TODAY’S PRACTICE SESSION



The examples and datasets in this Lab session follow very closely two sources:

Topics discussed in Lecture # 4

Lecture 4: topics

  • ….

R ENVIRONMENT SET UP & DATA

Needed R Packages

  • We will use functions from packages base, utils, and stats (pre-installed and pre-loaded)
  • We may also use the packages below (specifying package::function for clarity).
# Load pckgs for this R session

# --- General 
library(here) # A Simpler Way to Find Your Files   
library(skimr)    # Compact and Flexible Summaries of Data
library(paint)   # Fancy structure for data frames 
library(janitor)  # Simple Tools for Examining and Cleaning Dirty Data
library(tidyverse) # Easily Install and Load the 'Tidyverse'

# --- Plotting & data visualization
library(ggfortify)     # Data Visualization Tools for Statistical Analysis Results
library(ggpubr)        # 'ggplot2' Based Publication Ready Plots
library(dagitty) # Graphical Analysis of Structural Causal Models
library(ggdag) # Analyze and Create Elegant Directed Acyclic Graphs #
library(patchwork) # The Composer of Plots

# --- Descriptive statistics and regressions
library(broom) # Convert Statistical Objects into Tidy Tibbles
library(Hmisc) # Harrell Miscellaneous
library(estimatr) # Fast Estimators for Design-Based Inference
library(modelsummary) # Summary Tables and Plots for Statistical Models and Data:
library(sandwich) #for robust variance estimation
library(survey) # Analysis of Complex Survey Samples
library(haven) # Import and Export 'SPSS', 'Stata' and 'SAS' Files
library(simstudy) # Simulation of Study Data
library(NHANES) # Data from the US National Health and Nutrition Examination Study  

ITE

Individual Treatment Effect (ITE) is defined as the difference in potential outcomes for a given individual \(ITE_i= y_{i}^1 − y_{i}^0\)

The Causal Effect (CE)

\(CE_i = Y_{1i} - Y_{0i}\) where \(Y_{1i}\) and \(Y_{0i}\) are the potential outcomes for each individual \(i\) being both exposed and not exposed respectively (not possible in the real world).

Simulation

def <- simstudy::defData(varname = "C", formula = 0.4, dist = "binary")
def <- simstudy::defData(def, "X", formula = "0.3 + 0.4 * C", dist = "binary")
def <- simstudy::defData(def, "e", formula = 0, variance = 2, dist = "normal")
def <- simstudy::defData(def, "Y0", formula = "2 * C + e", dist="nonrandom")
def <- simstudy::defData(def, "Y1", formula = "1 + 2 * C + e", dist="nonrandom")
def <- simstudy::defData(def, "Y_obs", formula = "Y0 + (Y1 - Y0) * X", dist = "nonrandom") #  = Y1*X + (1-X)*Y0

set.seed(2017)
dt <- genData(10000, def)
paste0("Calculated mean causal effect = ", mean(dt[, Y1] - dt[, Y0])) #mean difference of counterfactual outcomes
[1] "Calculated mean causal effect = 1"
paste0("Calculated observed difference  = ", round(dt[X == 1, mean(Y_obs)] - dt[X == 0, mean(Y_obs)],2))
[1] "Calculated observed difference  = 1.71"
lm1 <- lm(Y_obs ~ X, data = dt)
tidy(lm1)
# A tibble: 2 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)    0.481    0.0225      21.3 8.84e-99
2 X              1.71     0.0335      51.1 0       

DAG with COLLIDER

# df DAG 
dag_collid <- dagify(
  Y ~ X + e,
  Z ~ X + Y,
  exposure = "X",
  outcome = "Y",
  #latent = "e",
  # labels
  labels = c(
    "Z" = "Collider",
    "X" = "Exposure",
    "Y" = "Outcome",
    "e" = "Unobserved \nerror"),
  # positions
  coords = list(
    x = c(X = 0, Y= 4, Z = 2 , e = 5),
    y = c(X = 0, Y = 0, Z = 2, e = 1) 
  )) %>% 
  tidy_dagitty() 

# Plot DAG 
dag_col_p <-  dag_collid %>% 
  ggdag(., layout = "circle") +  # decided in dagify
  theme_dag_blank(plot.caption = element_text(hjust = 1)) +
  # Nodes 
  geom_dag_node(aes(color = name)) +  
  # Labels dodged to avoid overlap
  geom_dag_label(aes(label = label), color = "#4c4c4c", nudge_x = 0.7, nudge_y = 0.2) +  
  
  geom_dag_text(color="white") +
  labs(title = "Causal map with COLLIDER (Z)" , 
       #subtitle = "X = exposure, Y = outcome, Z = collider, e = random error", 
       caption = ) +
  # Map colors to specific nodes
  scale_color_manual(values = c("X" = "#9b6723", "Y" = "#72aed8", "Z" = "#d02e4c",
                                "e" = "#A6A6A6"), 
                     guide = "none")    # Remove legend

dag_col_p

DAG with COLLIDER

DAG with CONFOUNDER

library(dagitty) # Graphical Analysis of Structural Causal Models # Graphical Analysis of Structural Causal Models # Graphical Analysis of Structural Causal Models 
library(ggdag) # Analyze and Create Elegant Directed Acyclic Graphs # Analyze and Create Elegant Directed Acyclic Graphs # Analyze and Create Elegant Directed Acyclic Graphs 

# df DAG 
dag_confounder <- dagify(
  Y ~ X + Z + e,
  X ~  Z,
  exposure = "X",
  outcome = "Y",
  #latent = "e",
  # labels
  labels = c(
    "Z" = "Confounder",
    "X" = "Exposure",
    "Y" = "Outcome",
    "e" = "Unobserved \nerror"),
  # positions
  coords = list(
    x = c(X = 0, Y= 4, Z = 2 , e = 5),
    y = c(X = 0, Y = 0, Z = 2, e = 1) 
  )) %>% 
  tidy_dagitty()     # Add edge_type within pipe

# Plot DAG 
dag_conf_p <-  dag_confounder %>% 
  ggdag(., layout = "circle") + 
  theme_dag_blank(plot.caption = element_text(hjust = 1)) +
  # Nodes 
  geom_dag_node(aes(color = name)) +  
  # Labels dodged to avoid overlap
  geom_dag_label(aes(label = label), color = "#4c4c4c", nudge_x = 0.7, nudge_y = 0.2) +  
  geom_dag_text(color="white") +
  labs(title = "Causal map with CONFOUNDER (Z)" , #subtitle = " ",
        caption = ) +
  # Map colors to specific nodes
  scale_color_manual(values = c("X" = "#9b6723", "Y" = "#72aed8", "Z" = "#d02e4c",
                                "e" = "#A6A6A6"), 
                     guide = "none")   

dag_conf_p

DAG with CONFOUNDER

DAG with MEDIATOR

library(dagitty) # Graphical Analysis of Structural Causal Models # Graphical Analysis of Structural Causal Models # Graphical Analysis of Structural Causal Models 
library(ggdag) # Analyze and Create Elegant Directed Acyclic Graphs # Analyze and Create Elegant Directed Acyclic Graphs # Analyze and Create Elegant Directed Acyclic Graphs 

# df DAG 
dag_med <- dagify(
  Y ~ Z + e,
  Z ~ X,
  
  exposure = "X",
  outcome = "Y",
  #latent = "e",
  # labels
  labels = c(
    "Z" = "Mediator",
    "X" = "Exposure",
    "Y" = "Outcome",
    "e" = "Unobserved \nerror"),
  # positions
  coords = list(
    x = c(X = 0, Y= 4, Z = 2 , e = 5),
    y = c(X = 0, Y = 0, Z = 2, e = 1) 
  )) %>% 
  tidy_dagitty()     # Add edge_type within pipe

# Plot DAG 
dag_med_p <-  dag_med %>% 
  ggdag(., layout = "circle") +  # decided in dagify
  theme_dag_blank(plot.caption = element_text(hjust = 1)) +
  # Nodes 
  geom_dag_node(aes(color = name)) +  
  # Labels dodged to avoid overlap
  geom_dag_label(aes(label = label), color = "#4c4c4c", nudge_x = 0.7, nudge_y = 0.2) +  
  
  geom_dag_text(color="white") +
  labs(title = "Causal map with MEDIATOR (Z)" , 
       #subtitle = "X = exposure, Y = outcome, Z = collider, e = random error", 
       caption = ) +
  # Map colors to specific nodes
  scale_color_manual(values = c("X" = "#9b6723", "Y" = "#72aed8", "Z" = "#d02e4c",
                                "e" = "#A6A6A6"), 
                     guide = "none")    # Remove legend

dag_med_p

DAG with MEDIATOR

—-

DATASETS for today

Importing Dataset 1 (NHANES)

Name: NHANES, accessible from package NHANES Documentation: See reference on the data downloaded and …. Sampling details: We’ll use a subset of the NHANES dataset, focusing on variables related to smoking status (SmokeNow), systolic blood pressure (BPSysAve), Body Mass Index (BMI) and age (Age).

data(NHANES)
# Select relevant columns and drop rows with missing values
nhanes_data <- NHANES %>%
  select(ID, Gender, Age, Race1, Height, Weight, BMI, SmokeNow, PhysActive, PhysActiveDays,
         BPSysAve, BPSysAve, BPDiaAve, TotChol, DirectChol, Diabetes, HealthGen,
         Education, HHIncomeMid, Poverty) %>%
  drop_na() %>% 
  # make it smaller 
  slice_sample(n = 1000) # Randomly select 1000 rows using

# Take a quick look at the data
#paint(nhanes_data)

Dataset 1 (NHANES) Variables and their description

[EXCERPT: see complete file in Input Data Folder]

Variable Type Description
X int xxxx
ID int xxxxx
SurveyYr chr yyyy_mm. Ex. 2011_12
Gender chr Gender (sex) of study participant coded as male or female
Age int ##
AgeDecade chr yy-yy es 20-29
Race1 chr Reported race of study participant: Mexican, Hispanic, White, #Black, or Other
Education chr [>= 20 yro]. Ex. 8thGrade, 9-11thGrade, HighSchool, SomeCollege, or CollegeGrad.
HHIncome chr Total annual gross income for the household in US dollars
HHIncomeMid int Numerical version of HHIncome derived from the middle income # in each category. Ex. 12500 40000
Poverty dbl A ratio of family income to poverty guidelines. Smaller # numbers indicate more poverty Ex.. 0.95 1.74 4.99
Weight dbl Weight in kg
Height dbl Standing height in cm. Reported for participants aged 2 years or older.
BMI dbl Body mass index (weight/height2 in kg/m2). Reported for participants aged 2 years or older
Pulse int 60 second pulse rate
BPSysAve int Combined systolic blood pressure reading, following the # procedure outlined for BPXSAR
BPDiaAve int Combined diastolic blood pressure reading, following the # procedure outlined for BPXDAR
DirectChol dbl Direct HDL cholesterol in mmol/L. Reported for participants aged 6 years or older
TotChol dbl Total HDL cholesterol in mmol/L. Reported for participants aged 6 years or older
Diabetes chr Study participant told by a doctor or health professional that they have diabetes
HealthGen chr Self-reported rating of health: Excellent, Vgood, Good, Fair, or Poor Fair
PhysActive chr Participant does moderate or vigorous-intensity sports, fitness or recreational activities (Yes or No).
SmokeNow chr Study participant currently smokes cigarettes regularly. (Yes or No)
... ... ...

https://bookdown.org/jbrophy115/bookdown-clinepi/confounding.html

Causal modeling with CONFOUNDER

  • Z = Age = confounder for (X = SmokeNow) and blood pressure (Y = BPSysAve)
data(NHANES)
# Select relevant columns and drop rows with missing values
nhanes_conf <- NHANES %>%
  select(ID, Age, SmokeNow, BPSysAve) %>%
  drop_na()
  
# Take a quick look at the data
paint(nhanes_conf)
tibble [3108, 4]
ID       int 51624 51624 51624 51630 51654 51666
Age      int 34 34 34 49 66 58
SmokeNow fct No No No Yes No Yes
BPSysAve int 113 113 113 112 111 127

In this context:

  • Age influences both smoking (SmokeNow) and blood pressure (BPSysAve), making it a confounder.
  • SmokeNow directly affects BPSysAve.

Causal modeling with CONFOUNDER (viz)

# linear rel 
qplot (Age, BPSysAve, data = nhanes_conf, color = SmokeNow) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Scatterplot of Age and Blood Pressure by Smoking Status",
       x = "Age",
       y = "Blood Pressure (mmHg)",
       color = "Smoking Status")

Causal modeling with CONFOUNDER (viz)

— Regression analysis - unadjusted model

# Unadjusted linear model 
model_unadjusted <- lm(BPSysAve ~ SmokeNow, data = nhanes_conf)
summary(model_unadjusted)

Call:
lm(formula = BPSysAve ~ SmokeNow, data = nhanes_conf)

Residuals:
   Min     1Q Median     3Q    Max 
-44.38 -12.12  -2.38   8.62 100.88 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  124.384      0.432  287.97  < 2e-16 ***
SmokeNowYes   -4.264      0.643   -6.64  3.8e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 17.8 on 3106 degrees of freedom
Multiple R-squared:  0.014, Adjusted R-squared:  0.0137 
F-statistic:   44 on 1 and 3106 DF,  p-value: 3.8e-11

— Regression analysis - adjusted model

# Adjusted model
model_adjusted <- lm(BPSysAve ~ SmokeNow + Age, data = nhanes_conf)
summary(model_adjusted)

Call:
lm(formula = BPSysAve ~ SmokeNow + Age, data = nhanes_conf)

Residuals:
   Min     1Q Median     3Q    Max 
-53.81  -9.87  -1.38   8.65  97.73 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 100.9969     1.0886   92.78   <2e-16 ***
SmokeNowYes   0.6840     0.6313    1.08     0.28    
Age           0.4317     0.0187   23.09   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 16.5 on 3105 degrees of freedom
Multiple R-squared:  0.158, Adjusted R-squared:  0.158 
F-statistic:  292 on 2 and 3105 DF,  p-value: <2e-16

— Compare models

  • adjusting for age changes the estimated effect of smoking on blood pressure
    • Unadjusted Model: This model does not control for age, so any observed relationship between smoking and blood pressure might be confounded.
    • Adjusted Model: This model controls for age, providing a more accurate estimate of the causal effect of smoking on blood pressure by accounting for the confounding influence of age.
conf_sum <- modelsummary::msummary(list( "NO Confounder" = model_unadjusted, 
                                         "Confounder" = model_adjusted), 
                                   fmt="%.3f", # format
                                   gof_omit = 'DF|Deviance|Log.Lik.|F|R2 Adj.|AIC|BIC',
                                   stars=c('*' = .05, '**' = .01)#,
                                   #coef_rename = cm,
                                   #output = paste(output,"ch19_reg-R.tex",sep="")
) 

— Compare models

conf_sum
NO Confounder Confounder
* p
(Intercept) 124.384** 100.997**
(0.432) (1.089)
SmokeNowYes -4.264** 0.684
(0.643) (0.631)
Age 0.432**
(0.019)
Num.Obs. 3108 3108
R2 0.014 0.158
RMSE 17.82 16.47

— Add Predicted Values to the Dataset

# Add predicted values from the unadjusted and adjusted models
nhanes_conf <- nhanes_conf %>%
  mutate(
    pred_unadjusted = predict(model_unadjusted),  # Predicted BPSysAve from unadjusted model
    pred_adjusted = predict(model_adjusted)       # Predicted BPSysAve from adjusted model
  )

— Plot results 1/2

  • Reshape the Data Using pivot_longer()

We will reshape the dataset so we can plot the predicted values from both models in a faceted plot.

# Reshape the data into long format using pivot_longer()
nhanes_long <- nhanes_conf %>%
  pivot_longer(cols = c(pred_unadjusted, pred_adjusted), 
               names_to = "model", values_to = "predicted") %>%
  mutate(model = recode(model, 
                        pred_unadjusted = "Unadjusted: SmokeNow on BPSysAve", 
                        pred_adjusted = "Adjusted: SmokeNow + Age on BPSysAve"))

— Plot results 2/2

# Violin plot with dashed line connecting the two conditions
ggplot(nhanes_long, aes(x = factor(SmokeNow), y = BPSysAve, fill = model)) +
  
  # Create the violin plot to show the distribution of blood pressure
  geom_point(color = "grey", alpha = 0.4, position = position_dodge(width = 0.3) ) +
  
  # Overlay the predicted dashed line between the two smoking conditions
  geom_line(aes(y = predicted, group = model, color = model), 
            size = 0.8, linetype = "dashed", position = position_dodge(width = 0.3)) +
  
  # Facet vertically for unadjusted and adjusted models
  facet_wrap(model ~ ., scales = "free_y",ncol = 2) +
  
  # Add labels and title
  labs(title = "Confounder Analysis: Predicted values in Unadjusted vs Adjusted Regression models",
       subtitle = "Age = Confounder",
       x = "Smoking Status",
       y = "Systolic Blood Pressure (BPSysAve)",
       color = "Model",
       fill = "Model"
       ) +
  
  # Customize colors for better contrast
  scale_fill_manual(values = c("Unadjusted: SmokeNow on BPSysAve" ="lightcoral" , 
                               "Adjusted: SmokeNow + Age on BPSysAve" = "lightblue")) +
  scale_color_manual(values = c("Unadjusted: SmokeNow on BPSysAve" = "red", 
                                "Adjusted: SmokeNow + Age on BPSysAve" = "blue")) +
  
  # Minimal theme for clarity
  theme_minimal()+
  theme(legend.position = "none")

— Plot results 2/2

Causal modeling with MEDIATOR

  • M = BMI = mediator for the effect of X= PhysActiveDays on Y = BPSysAve
data(NHANES)
# Select relevant columns and drop rows with missing values
nhanes_med <- NHANES %>%
  select(Gender, Age, BPSysAve, BMI, PhysActiveDays, Diabetes, SmokeNow) %>%
  drop_na()

# Take a quick look at the data
summary(nhanes_med)
    Gender         Age          BPSysAve        BMI       PhysActiveDays
 female:632   Min.   :20.0   Min.   : 81   Min.   :15.0   Min.   :1.0   
 male  :831   1st Qu.:34.0   1st Qu.:111   1st Qu.:23.6   1st Qu.:2.0   
              Median :47.0   Median :119   Median :27.0   Median :3.0   
              Mean   :47.8   Mean   :122   Mean   :27.9   Mean   :3.6   
              3rd Qu.:60.0   3rd Qu.:131   3rd Qu.:31.3   3rd Qu.:5.0   
              Max.   :80.0   Max.   :221   Max.   :59.1   Max.   :7.0   
 Diabetes   SmokeNow 
 No :1312   No :834  
 Yes: 151   Yes:629  
                     
                     
                     
                     

— Regression analysis - unadjusted (total) model

Before adjusting for any mediator, we fit a simple linear model to estimate the total effect total effect of BMI on Systolic Blood Pressure without considering Age.

  • Total Effect Model (Unadjusted) .
# Unadjusted model: total effect of smoking on blood pressure
total_effect_model <- lm(BPSysAve ~ PhysActiveDays, data = nhanes_med)
summary(total_effect_model)

Call:
lm(formula = BPSysAve ~ PhysActiveDays, data = nhanes_med)

Residuals:
   Min     1Q Median     3Q    Max 
-43.25 -11.71  -3.07   8.11  98.52 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)     120.120      1.055  113.88   <2e-16 ***
PhysActiveDays    0.590      0.262    2.25    0.025 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 18.1 on 1461 degrees of freedom
Multiple R-squared:  0.00345,   Adjusted R-squared:  0.00277 
F-statistic: 5.06 on 1 and 1461 DF,  p-value: 0.0246

This model gives the total effect of of PhysActiveDays on Systolic Blood Pressure (including any indirect effects through M = BMI or other variables that haven’t been included.)

— Regression analysis - MEDIATOR model

Now we will set up the mediation models:

  • Mediator model: This model estimates the effect of physical activity PhysActiveDays on BMI BMI (= M). This allows us to see whether more physically active days are associated with lower BMI values (which could then affect blood pressure).
# Mediator model: SmokeNow affects BMI
mediator_model <- lm(BMI ~ PhysActiveDays, data = nhanes_med)
summary(mediator_model)

Call:
lm(formula = BMI ~ PhysActiveDays, data = nhanes_med)

Residuals:
    Min      1Q  Median      3Q     Max 
-12.705  -4.335  -0.943   3.474  31.129 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)     28.2176     0.3444   81.93   <2e-16 ***
PhysActiveDays  -0.0821     0.0856   -0.96     0.34    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.91 on 1461 degrees of freedom
Multiple R-squared:  0.00063,   Adjusted R-squared:  -5.44e-05 
F-statistic: 0.92 on 1 and 1461 DF,  p-value: 0.338
  • Outcome Model (Adjusted for BMI): This model estimates the effect of PhysActiveDays and BMI on Systolic Blood Pressure (BPSysAve). This is the adjusted model, where we adjust for the mediator (BMI).
# Outcome model: PhysActiveDays and BMI affect BPSysAve
outcome_model <- lm(BPSysAve ~  PhysActiveDays + BMI, data = nhanes_med)
summary(outcome_model)

Call:
lm(formula = BPSysAve ~ PhysActiveDays + BMI, data = nhanes_med)

Residuals:
   Min     1Q Median     3Q    Max 
-42.10 -11.64  -3.06   7.90 101.84 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    109.3530     2.4763   44.16  < 2e-16 ***
PhysActiveDays   0.6213     0.2603    2.39    0.017 *  
BMI              0.3816     0.0795    4.80  1.8e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 18 on 1460 degrees of freedom
Multiple R-squared:  0.0189,    Adjusted R-squared:  0.0176 
F-statistic: 14.1 on 2 and 1460 DF,  p-value: 8.78e-07

— Compare models

med_sum <- modelsummary::msummary(list("BMI ~ Mediator Effect" = mediator_model,
                                       "BPSysAve ~ Total Effect" = total_effect_model,
                                       "BPSysAve ~ Outcome" = outcome_model), 
                                  fmt="%.3f", # format
                                  gof_omit = 'DF|Deviance|Log.Lik.|F|R2 Adj.|AIC|BIC',
                                  stars=c('*' = .05, '**' = .01)#,
                                  #coef_rename = cm,
                                  #output = paste(output,"ch19_reg-R.tex",sep="")
) 

— Compare models

med_sum
BMI ~ Mediator Effect BPSysAve ~ Total Effect BPSysAve ~ Outcome
* p
(Intercept) 28.218** 120.120** 109.353**
(0.344) (1.055) (2.476)
PhysActiveDays -0.082 0.590* 0.621*
(0.086) (0.262) (0.260)
BMI 0.382**
(0.080)
Num.Obs. 1463 1463 1463
R2 0.001 0.003 0.019
RMSE 5.90 18.08 17.94

— Calculate the Indirect, Direct, and Total Effects

Now that we have the coefficients from the models, we can calculate the mediation effects manually.

  • Direct Effect: This is the coefficient of PhysActiveDays in the outcome model, which gives the direct effect of exercise on blood pressure (controlling for BMI = M)
  • Indirect Effect: This is the product of the coefficient from the mediator model (PhysActiveDaysBMI) and the coefficient from the outcome model (BMIBPSysAve).
  • Total Effect: This is the coefficient of PhysActiveDays in the unadjusted model (NOT controlling for BMI = M).
# Check the names of the coefficients in models...
names(coef(mediator_model))     # [1] BMI ~ PhysActiveDays 
[1] "(Intercept)"    "PhysActiveDays"
names(coef(outcome_model))      # [1] BPSysAve ~  PhysActiveDays + BMI    
[1] "(Intercept)"    "PhysActiveDays" "BMI"           
names(coef(total_effect_model)) # [1] BPSysAve ~ PhysActiveDays
[1] "(Intercept)"    "PhysActiveDays"
# BREAKING DOWN THE ESTIMATE OF PhysActiveDays COEFFICIENT 

# TOTAL effect: The effect of PhysActiveDays on BPSysAve without adjusting for BMI
total_effect <- coef(total_effect_model)["PhysActiveDays"]
total_effect # 0.59
PhysActiveDays 
          0.59 
# DIRECT effect: The effect of PhysActiveDays on BPSysAve after adjusting for BMI.
direct_effect <- coef(outcome_model)["PhysActiveDays"]  # Direct effect (controlling for BMI)
direct_effect # 0.621 
PhysActiveDays 
         0.621 
# Indirect effect: The portion of the effect on BPSysAve that is mediated through BMI
indirect_effect <- coef(mediator_model)["PhysActiveDays"] * coef(outcome_model)["BMI"]   
indirect_effect # [-0.0821 X 0.382] = -0.0313 
PhysActiveDays 
       -0.0313 
# or 
total_effect - direct_effect # !!!!!!!!  -0.0313
PhysActiveDays 
       -0.0313 

— Proportion mediated

# Proportion mediated: The proportion of the total effect that is mediated through BMI
proportion_mediated <- glue::glue("{round(indirect_effect / total_effect *100, 2)}%")
proportion_mediated #  -0.0531
-5.31%

— Visualise the Indirect, Direct, and Total Effects

“+ Physical Activity” -> “+ Systolic Blood Pressure (BPSysAve)” | “+ BMI -> - BPSysAve” WTF ???

# Create a data frame for the effects
effects_df <- data.frame(
  Effect = c("Total Effect", "Direct Effect", "Indirect Effect"),
  Value = c(total_effect, direct_effect, indirect_effect)
)

# Load ggplot2
library(ggplot2)

# Create the bar plot
ggplot(effects_df, aes(x = Effect, y = Value, fill = Effect)) +
  geom_bar(stat = "identity") +
  labs(title = "Mediation Analysis: Total, Direct, and Indirect Effects", 
       subtitle = "Effect (coefficient) of Physical Activity on Systolic Blood Pressure (BPSysAve)",
       y = "", x = "") +
  theme_minimal()

— Interpret results

  • The total effect tells us the overall relationship between PhysActiveDays and BPSysAve.

  • The direct effect represents the relationship between PhysActiveDays and BPSysAve, controlling for BMI.

  • The indirect effect (mediated effect) shows how much of the relationship between PhysActiveDays and BPSysAve is explained through BMI (AKA the coef of PhysActiveDays in the mediator model times the coef of BMI in the outcome model)

  • The proportion mediated indicates how much of the total effect is due to the mediation by BMI.

— Calculate the Indirect, Direct, and Total Effects (modo2)

library(mediation)
# recall
mediator_model[["call"]][["formula"]]
BMI ~ PhysActiveDays
outcome_model[["call"]][["formula"]]
BPSysAve ~ PhysActiveDays + BMI
# Calculate the mediation effect
mediation_model <- mediate(mediator_model, outcome_model, 
                           treat = "PhysActiveDays", 
                           mediator = "BMI", 
                           boot=TRUE, sims=500)
summary(mediation_model)

Causal Mediation Analysis 

Nonparametric Bootstrap Confidence Intervals with the Percentile Method

               Estimate 95% CI Lower 95% CI Upper p-value   
ACME            -0.0313      -0.1073         0.02   0.304   
ADE              0.6213       0.1182         1.13   0.008 **
Total Effect     0.5899       0.0590         1.09   0.012 * 
Prop. Mediated  -0.0531      -0.5684         0.04   0.308   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Sample Size Used: 1463 


Simulations: 500 
#                Estimate 95% CI Lower 95% CI Upper p-value  
# ACME            -0.0313      -0.1024         0.04   0.324     = indirect or mediation effect
# ADE              0.6213       0.1479         1.16   0.024 *   = direct effect 
# Total Effect     0.5899       0.1354         1.15   0.020 *   = total effect 
# Prop. Mediated  -0.0531      -0.2969         0.10   0.344     = proportion mediated
# Add predicted values for BMI, BPSysAve (adjusted), and BPSysAve (total effect)
nhanes_med_pred <- nhanes_med %>%
  mutate(
    pred_bmi = predict(mediator_model),         # Predicted BMI (mediator model)
    pred_bps_adj = predict(outcome_model),      # Predicted BPSysAve (adjusted for Age)
    pred_bps_total = predict(total_effect_model) # Predicted BPSysAve (total effect)
  )

— Plot results 1/2

Now, let’s create a plot that includes:

  • The total effect of BMI on blood pressure (without adjusting for Age).
  • The adjusted effect of BMI on blood pressure (controlling for Age).
  • The mediator effect (BMI’s effect on Age).
# Check column names
colnames(nhanes_med_pred)
 [1] "Gender"         "Age"            "BPSysAve"       "BMI"           
 [5] "PhysActiveDays" "Diabetes"       "SmokeNow"       "pred_bmi"      
 [9] "pred_bps_adj"   "pred_bps_total"
# Reshape the data into long format for faceting
nhanes_long <- nhanes_med_pred %>%
  gather(key = "model", value = "predicted",  pred_bps_adj, pred_bps_total) %>%
  mutate(model = recode(model, 
                        #pred_bmi = "Mediator Effect: BPSysAve on BMI", 
                        pred_bps_adj = "Adjusted Effect: PhysActiveDays + BMI on BPSysAve", 
                        pred_bps_total = "Total Effect: PhysActiveDays on BPSysAve"))

— Plot results 2/2

# Plot with vertically aligned facets
ggplot(nhanes_long, aes(x = BMI)) +
  geom_point(aes(y = BPSysAve), color = "black", shape = 16, alpha = 0.5) +  # Actual data points for BPSysAve
  geom_line(aes(y = predicted, color = model), size = 1) +  # Predicted values from different models
  facet_grid(model ~ ., scales = "free_y") +  # Facet vertically
  labs(title = "Mediation Analysis: Total, Adjusted, and Mediator Effects",
       x = "BMI",
       y = "Outcome / Mediator",
       color = "Effect Type") +
  theme_minimal()

— Plot results 2/2

—-

QUI

Equitable Equations !!!!!

Importing Dataset 2 (Gabor)

Name:
Documentation: .. Sampling details:

# import data
data <- read_csv(paste0(data_in,"food-health.csv")) # 164,848 

# drop observations
dat <-  data %>%
  filter(age >= 30 & age <60) %>% # 7930 
  # new variables: 
  ## Fruit and vegetables per day (grams)
  ## Blood pressure (systolic+diastolic)
  mutate(fv=veggies_n_fruits_gr,
         bp=blood_pressure) %>%
  filter(fv<3200) %>%
  drop_na(bp) %>% # 7358
  # Days per week exercising 
  mutate(exerc=case_when(paq655 <=7 ~ paq655,
                         paq650 ==2 ~ 0)) %>% 
  # Potato chips per day, grams
  mutate(pchips=gr_potato_chips) %>% 
# select variables
  select(id = seqn, age, gender, weight, height, race, 
         fv, exerc, pchips, 
         bp_systolic, bp_diastolic, bp, bmi,  
         smoker, total_cholesterol, hdl,
         hh_income_usd, hh_income_percap, ln_hh_income_percap ,income_cat
                  ) %>% 
  drop_na() # 7358
 

# Hmisc::describe(dat$hh_income)
paint(dat)

Dataset 2 (Gabor) Variables and their description

[EXCERPT: see complete file in Input Data Folder]

https://bookdown.org/jbrophy115/bookdown-clinepi/confounding.html

Causal modeling with CONFOUNDER

# Select relevant columns and drop rows with missing values
dat_conf <- dat %>%
  select(id, age, bp_systolic, fv, exerc, bmi, smoker, gender, hh_income_usd, ln_hh_income_percap ) %>%
  drop_na() %>% 
  filter(fv <=1500)

# Take a quick look at the data
summary(dat_conf)

Hmisc::describe(dat_conf$bp_systolic) # Y
Hmisc::describe(dat_conf$bmi) # X
Hmisc::describe(dat_conf$age) # Z (confounder )
Hmisc::describe(dat_conf$exerc)  # Z (confounder )
Hmisc::describe(dat_conf$ln_hh_income_percap) # Z (confounder )

In this context:

  • exerc influences both smoking (fv) and blood pressure (bp_systolic), making it a confounder (albeit as PROXY for socio-economic status)
  • fv directly affects bp_systolic

— Regression analysis

# Unadjusted model
model_unadjusted <- lm( bp_systolic ~ bmi + ln_hh_income_percap , data = dat_conf)
summary(model_unadjusted)

# Adjusted model
model_adjusted <- lm(bp_systolic ~ bmi + age + ln_hh_income_percap, data = dat_conf)
summary(model_adjusted)
  • adjusting for Z changes the estimated effect of X on Y
    • Unadjusted Model: This model does not control for Z, which is a confounder of the relationship between X and Y.
    • Adjusted Model: This model controls for Z, providing a more accurate estimate of the causal effect of X on Y.

— Add Predicted Values to the Dataset

model_unadjusted$coefficients 
model_adjusted$coefficients

# Get predicted values using augment and attach to the original dataset
dat_unadjusted <- broom::augment(model_unadjusted, data = dat_conf) %>%
  select( -.resid,  -.hat, -.sigma, -.cooksd, -.std.resid ) %>%
  rename(pred_unadjusted = .fitted)

dat_adjusted <- broom::augment(model_adjusted, data = dat_conf) %>%
  select( -.resid,     -.hat, -.sigma, -.cooksd, -.std.resid ) %>%
  rename(pred_adjusted = .fitted)

# Combine the predicted values from both models
dat_conf_long <- dat_unadjusted %>%
  left_join(dat_adjusted, by = "id", suffix=c("",".y"))%>%
  select(-ends_with(".y")) %>% 
  # Convert the data from wide to long format
  pivot_longer(cols = c(pred_unadjusted, pred_adjusted), 
               names_to = "model", 
               values_to = "predicted")

# Check the structure of the long dataset
str(dat_conf_long)

— Plot results 1/2

  • Reshape the Data Using pivot_longer()

We will reshape the dataset so we can plot the predicted values from both models in a faceted plot.

# check 
dat_conf_long %>% group_by(model) %>% 
  summarise(mean(predicted), sd(predicted))

— Plot results 2/2

ggplot(dat_conf, aes(x = bmi, y = bp_systolic)) +
  geom_point(color = "grey", alpha = 1) +  
  geom_point(data = dat_unadjusted, aes(x = bmi, y = pred_unadjusted), 
             color = "#e60066", alpha = 0.20) +
  geom_point(data = dat_adjusted, aes(x = bmi, y = pred_adjusted), 
             color = "#399B23", alpha = 0.20) +
  # Add labels and title
  labs(title = "Confounder Analysis: Unadjusted (pink) vs Adjusted (green) fitted values",
       x = "BMI",
       y = "Systolic Blood Pressure (BPSysAve)"
       )  +
    # Minimal theme for clarity
  theme_minimal() +
  theme(legend.position = "none")

— Plot results 2/2

— Different

https://nicholasrjenkins.science/post/data_viz_r/data_visualization_r/

— Regression analysis

# Unadjusted model
model_unadjusted <- lm( bp_systolic ~ bmi  , data = dat_conf)  
 # Adjusted model
model_adjusted <- lm(bp_systolic ~ bmi + age  , data = dat_conf)  
tidy(model_unadjusted, conf.int = TRUE ) %>% 
  ggplot(data = ., 
         mapping = aes(x = estimate, y = term, xmin = conf.low, xmax = conf.high)) +
  geom_pointrange() +
  labs(title = "Model Estimates of bmi on bp_systolic",
       x = "Coefficient Estimate",
       y = "Predictor",
       caption = "Models fit with OLS. Error bars show the 95% confidence interval.") +
  scale_y_discrete(labels = c("Intercept",  "bmi")) +
  ggpubr::theme_pubclean(flip = TRUE)
tidy(model_adjusted, conf.int = TRUE ) %>% 
  ggplot(data = ., 
         mapping = aes(x = estimate, y = term, xmin = conf.low, xmax = conf.high)) +
  geom_pointrange() +
  labs(title = "Model Estimates of bmi + age on bp_systolic",
       x = "Coefficient Estimate",
       y = "Predictor",
       caption = "Models fit with OLS. Error bars show the 95% confidence interval.") +
  scale_y_discrete(labels = c("Intercept", "age", "bmi")) +
  ggpubr::theme_pubclean(flip = TRUE)
fit_unadj_results <- tidy(model_unadjusted, conf.int = TRUE) %>% 
  mutate(model = "Model Unadjusted")

fit_adj_results <- tidy(model_adjusted, conf.int = TRUE) %>% 
  mutate(model = "Model Adjusted")

model_results <- bind_rows(fit_unadj_results, fit_adj_results)
ggplot(data = model_results, 
       aes(x = estimate, y = term, xmin = conf.low, xmax = conf.high, 
           color = model, shape = model)) +
  geom_pointrange(position = position_dodge(width = 0.5)) +
  labs(title = "Model Estimates of BMI and age on REM Sleep",
       x = "Coefficient Estimate",
       y = "Predictor",
       caption = "Models fit with OLS. Error bars show the 95% confidence interval.",
        color = "Model:",
       shape = "Model:") +
  scale_y_discrete(labels = c("Intercept", "age", "bmi")) +
  ggpubr::theme_pubclean(flip = FALSE)

Final thoughts

  • …..